suppressPackageStartupMessages({
  import(rpkgs)
})

import(run)
import(util)
## [1] TRUE

Baseline model predict target = average value for the asset

Model

modelName = "baseline-avg"

assets = getAllAssets()
## 2021-12-01 03:19:56 INFO::Sourcing ALL_ASSETS
runModel = \() {
  doRun(
    name = modelName,
    trnAmt = 60 * 24 * 7 * 1, # 1 week of data, chosen arbitrarily
    tstAmt = 60 * 24 * 7 * 2, # 2 weeks, submission period will provide new data every 2 weeks
    assets = assets[,asset_id],

    makeData = \(env, minDate, maxDate, assets, ...) {
      selectStmt = glue('
        SELECT ts, asset_id, asset_name, target
        FROM trn
        WHERE (ts BETWEEN $1 AND $2)
          AND asset_id IN ({paste(assets, collapse = ", ")})
      ')

      df = getQuery(selectStmt, params = list(minDate, maxDate))
      
      env$x = df[,.(ts, asset_id, asset_name)]
      env$y = df[,.(target)]
    },

    trainModel = \(model, trn, ...) {
      # give the model a description
      model$description = 'mean of target'

      model$getKeyForAsset = \(a) paste("asset-", a)
      for (a in unique(trn$x[,asset_id])) {
        idx = trn$x[,asset_id] == a
        key = model$getKeyForAsset(a)
        prediction = mean(trn$y[idx,target], na.rm = TRUE)
        if (is.na(prediction)) prediction = 0
        model[[key]] = prediction
      }
    },

    predictModel = \(model, tst, ...) {
      # use advanced machine learning algorithm to predict crypto movement
      tst$yhat = vector(mode = "numeric", length = nrow(tst$x))
      tst$yhat[1:length(tst$yhat)] <- NA
  
      for (a in unique(tst$x[,asset_id])) {
        idx = tst$x[,asset_id] == a
        key = model$getKeyForAsset(a)
        tst$yhat[idx] <- model[[key]]
      }
    }
  )
}

Method

Same method as was used for the baseline “target = 0” model.

numSamples = 610

set.seed(205794)

for (i in 1:numSamples) {
  results = runModel()
}

Plots

We can examine the results from the last run, as a sanity-check.

df = results$tst$x
df$y = results$tst$y$target
df$yhat = results$tst$yhat

set.seed(68420)

# sample of data
plotStart = sample(df[,ts], 1)
plotEnd = plotStart + as.difftime(200, units = "mins")

assets[sample(nrow(assets), 2),asset_name] |>
  lapply(\(asset) {
    df[asset_name == asset & ts > plotStart & ts < plotEnd] |>
      melt(id.vars = c("ts", "asset_name"), measure.vars = c("y", "yhat")) |>
      ggplot(aes(ts, value, colour = variable)) +
      geom_line() +
      facet_wrap(~asset_name, ncol = 1)
  }) |>
  print()
## Warning: Removed 7 row(s) containing missing values (geom_path).

The competition metric is correlation between your predictions and the targets.

Visualising this:

## Warning: Removed 62809 rows containing non-finite values (stat_bin2d).

Remember, that’s just for 1 run; we repeated that experiment 610 times!

Evaluation

scores = getQuery('SELECT * FROM metrics WHERE name = $1', params = list(modelName))
DT::datatable(scores[,.(run_id, corr, mae, aae, rmse)])
performancePlot = \(x) {
  # https://stackoverflow.com/a/36344354

  count = sum(!is.na(x))
  mean = mean(x, na.rm = TRUE)
  sd = sd(x, na.rm = TRUE)
  sem = sd / sqrt(count)
  range = (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
  breaks = 30
  binwidth = range / breaks

  labelY = max(hist(x, breaks = breaks, plot = FALSE)$counts)

  makePlot = \(plotRange) {
    if (plotRange == "relative") {
      curveX = linspace(mean - range/2, mean + range/2, 100)
      labelX = mean + range/2
    } else if (plotRange == "absolute") {
      curveX = linspace(-0.05, 1.0, 1000)
      labelX = 1.0
    }

    annFmt = \(n) format(round(n, 5), nsmall=5)
    mcorrAnn = annotate(
      "text",
      label = glue('mean: {annFmt(mean)}\ns.err: {annFmt(sem)}\ns.dev: {annFmt(sd)}'),
      family = "monospace",
      hjust = "right",
      vjust = "top",
      x = labelX, y = labelY
    )

    curveY = dnorm(curveX, mean = mean, sd = sd) * binwidth * count
    curveD = data.table(
      corr = curveX,
      count = curveY
    )

    ggplot(data.frame(x = x), aes(x)) +
      xlab("corr") +
      ylab("count") +
      geom_histogram(binwidth = binwidth) +
      geom_line(data = curveD, aes(x = corr, y = count)) +
      geom_vline(xintercept = mean, color = "green") +
      geom_vline(xintercept = mean - 2.5*sem, color = "green", alpha = 0.5) +
      geom_vline(xintercept = mean + 2.5*sem, color = "green", alpha = 0.5) +
      geom_vline(xintercept = mean - 2.5*sd, color = "blue", alpha = 0.5) +
      geom_vline(xintercept = mean + 2.5*sd, color = "blue", alpha = 0.5) +
      mcorrAnn +
      labs(
        title = glue('Distribution of corr ({plotRange})')
      )
  }

  results = new.env(parent = .GlobalEnv)
  results$relativePlot = makePlot("relative")
  results$absolutePlot = makePlot("absolute")

  results$count = count
  results$mean = mean
  results$sd = sd
  results$sem = sem
  results$range = range
  results$binwidth = binwidth
  results$shapiro.test = shapiro.test(x)
  results$t.test = t.test(x)

  class(results) <- append(class(results), "performancePlot")
  results
}

format.performancePlot = \(p) {
  sw = paste(capture.output(print(p$shapiro.test)), collapse = "\n")
  tt = paste(capture.output(print(p$t.test)), collapse = "\n")
  glue('performancePlot:
       count = {p$count}
        mean = {p$mean}
          sd = {p$sd}
         sem = {p$sem}
       range = {p$range}
    binwidth = {p$binwidth}
relativePlot = <ggplot plot object>
absolutePlot = <ggplot plot object>
      t.test = {tt}')
}

print.performancePlot = \(p, ...) cat(format(p), ...)

p = performancePlot(scores[,corr])
print(p)
## performancePlot:
##        count = 610
##         mean = 0.00112118099786689
##           sd = 0.00900130400299388
##          sem = 0.000364452146118459
##        range = 0.069905162
##     binwidth = 0.00233017206666667
## relativePlot = <ggplot plot object>
## absolutePlot = <ggplot plot object>
##       t.test = 
##  One Sample t-test
## 
## data:  x
## t = 3.0763, df = 609, p-value = 0.00219
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.0004054455 0.0018369165
## sample estimates:
##   mean of x 
## 0.001121181

Absolute range performance plot

Relative range performance plot

Interactive plots

Normality checks

## 
##  Shapiro-Wilk normality test
## 
## data:  x
## W = 0.97935, p-value = 1.405e-07